In [51]:
import plotly.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np

# Global cufflinks settings: offline rendering, not world-readable, ggplot theme.
cf.set_config_file(theme='ggplot', world_readable=False, offline=True)

# Load the three count datasets (one row per day / minute / second respectively).
df_d, df_m, df_s = (
    pd.read_csv(csv_path)
    for csv_path in ("days_data.csv", "minute_data.csv", "second_data.csv")
)

Day

Each row is a single day

In [2]:
df_d.head()  # first five rows (the default)
Out[2]:
Unnamed: 0 count day_of_week
0 0 8.822189e+06 Saturday
1 1 1.065390e+07 Sunday
2 2 1.220661e+07 Monday
3 3 1.107984e+07 Tuesday
4 4 1.197607e+07 Wednesday
In [3]:
# Interactive cufflinks plot of the per-day "count" column.
df_d.iplot(y="count")

Minute

Each row is a single minute

In [4]:
df_m.head()  # first five rows (the default)
Out[4]:
Unnamed: 0 count
0 0 3659
1 1 3465
2 2 3159
3 3 3128
4 4 3544
In [5]:
# Interactive cufflinks plot of the per-minute "count" column.
df_m.iplot(y="count")

Second

Each row is a single second

In [6]:
df_s.head()  # first five rows (the default)
Out[6]:
Unnamed: 0 count
0 0 87
1 1 76
2 2 81
3 3 110
4 4 78
In [7]:
# Interactive cufflinks plot of the per-second "count" column.
df_s.iplot(y="count")

Example: Mutations on Per Minute Data

In [8]:
import datetime
import random
from itertools import cycle
import random
import seaborn as sns
# Default figure size for seaborn/matplotlib plots: 11.7 x 8.27 (A4 landscape, in inches).
sns.set(rc={'figure.figsize':(11.7,8.27)})

def get_circular_data():
    """Return an endless iterator over the per-minute counts.

    The iterator walks ``df_m["count"]`` from the first row to the last and
    then starts over, so a caller can keep drawing values for a date range
    that is longer than the dataset itself.
    """
    return cycle(df_m["count"].values)

def generate_data(start, end, step, my_func):
    """Build one record per time step in the half-open range [start, end).

    Args:
        start: ``datetime.datetime`` of the first record.
        end: ``datetime.datetime`` upper bound (exclusive).
        step: ``datetime.timedelta`` between consecutive records; must be
            positive.
        my_func: Iterator supplying the value of each record; it is advanced
            with ``next()`` exactly once per step.

    Returns:
        A list of ``{"timestamp": "YYYY-MM-DD HH:MM:SS", "value": ...}`` dicts
        in chronological order. Empty when ``start >= end``.

    Raises:
        ValueError: If ``step`` is zero or negative (the loop could never
            reach ``end``).
        StopIteration: If ``my_func`` runs out of values before ``end``.
    """
    # A non-positive step would never advance past `end`; with an endless
    # iterator such as itertools.cycle that means unbounded memory growth.
    if step <= datetime.timedelta(0):
        raise ValueError("step must be a positive timedelta")

    result = []
    current = start
    while current < end:
        result.append({
            "timestamp": current.strftime('%Y-%m-%d %H:%M:%S'),
            "value": next(my_func),
        })
        current += step
    return result

# Simulation window: 1 Feb 2019 00:00:00 up to 5 Feb 2019 23:59:59,
# sampled once per minute to match the per-minute source dataset.
start = datetime.datetime(year=2019, month=2, day=1)
end = datetime.datetime(year=2019, month=2, day=5, hour=23, minute=59, second=59)
step = datetime.timedelta(seconds=60)
In [9]:
# One row per simulated minute, values drawn cyclically from the minute dataset.
df = pd.DataFrame(
    data=generate_data(start=start, end=end, step=step, my_func=get_circular_data())
)
In [10]:
# Interactive cufflinks plot of the generated "value" column.
df.iplot(y="value")
In [11]:
df.head(n=5)
Out[11]:
timestamp value
0 2019-02-01 00:00:00 3659
1 2019-02-01 00:01:00 3465
2 2019-02-01 00:02:00 3159
3 2019-02-01 00:03:00 3128
4 2019-02-01 00:04:00 3544
In [53]:
# Latency is loosely related to traffic volume
#  (a well-behaved system would not slow down as traffic grows)
def latency(value):
    """Simulate a latency figure for a given traffic volume.

    Starts from a baseline of 500 (units presumably milliseconds -- TODO
    confirm) and multiplies it by a random factor for every traffic band
    that ``value`` falls into:

    * ``value > 9000``  -> factor drawn from [1.01, 2.01]
    * ``value > 11000`` -> additionally a factor drawn from [1.09, 2.09]
    * ``value < 5000``  -> factor drawn from [0.7, 1.7]

    Values from 5000 to 9000 inclusive return the untouched baseline.
    """
    simulated = 500
    bands = (
        (value > 9000, 1.01),
        (value > 11000, 1.09),
        (value < 5000, 0.7),
    )
    for applies, floor in bands:
        if applies:
            # One random draw per matching band, in band order.
            simulated = simulated * (floor + random.uniform(0, 1))
    return simulated
    
# We don't want to see many database errors, so let's simulate rare occurrences
def db_errors(value, host_num):
    """Randomly flag a database error for one host at one point in time.

    Args:
        value: Traffic volume for the row. Not used in the calculation; it is
            accepted so the function can be passed to ``Series.apply``.
        host_num: Host identifier selecting the error rate. Hosts 1, 2 and 3
            have their own rates; any other value uses the generic rate.

    Returns:
        ``1`` when an error is simulated, otherwise ``np.nan`` (so that
        non-events are skipped when plotted and ignored when summed).
    """
    # host_num -> (upper bound passed to randint, threshold the draw must exceed)
    # NOTE(review): host 1 fires for draws 77778..100000, i.e. roughly 22% of
    # the time, which is not "rare" -- confirm the threshold is intended.
    per_host = {
        1: (100000, 77777),
        2: (100000, 99998),
        3: (1000000, 999999),
    }
    # Unknown hosts fall back to the generic rate (only a draw of 100 out of 0..100).
    upper, threshold = per_host.get(host_num, (100, 99))

    # Previously the per-host branches computed a result without returning it,
    # so every host silently used the generic fallback rate.
    return 1 if random.randint(0, upper) > threshold else np.nan
    
    
# HTTP status-code series, each a fixed multiple of the traffic volume
df["http_500"] = df["value"].mul(.05)  # Server Error
df["http_404"] = df["value"].mul(.1)   # Page not found
df["http_200"] = df["value"].mul(5)    # OK

# Application-level metrics derived from the same traffic volume
df["login_success"] = df["value"].mul(.6)
df["latency"] = df["value"].apply(latency)

# One sparse error-flag column per database host (host number passed as 2nd argument)
df["db_errors_host01"] = df["value"].apply(db_errors, args=(1,))
df["db_errors_host02"] = df["value"].apply(db_errors, args=(2,))
df["db_errors_host03"] = df["value"].apply(db_errors, args=(3,))
In [54]:
df.head(n=5)
Out[54]:
timestamp value http_500 http_404 http_200 login_success latency db_errors_host01 db_errors_host02 db_errors_host03
timestamp
2019-02-01 00:00:00 2019-02-01 00:00:00 3659 182.95 365.9 18295 2195.4 371.643816 NaN NaN NaN
2019-02-01 00:01:00 2019-02-01 00:01:00 3465 173.25 346.5 17325 2079.0 548.694264 NaN NaN NaN
2019-02-01 00:02:00 2019-02-01 00:02:00 3159 157.95 315.9 15795 1895.4 437.684944 NaN NaN NaN
2019-02-01 00:03:00 2019-02-01 00:03:00 3128 156.40 312.8 15640 1876.8 669.331753 NaN NaN NaN
2019-02-01 00:04:00 2019-02-01 00:04:00 3544 177.20 354.4 17720 2126.4 353.212162 NaN NaN NaN
In [55]:
# Compare the two simulated HTTP error series over time.
df.iplot(x="timestamp", y=["http_500", "http_404"])
In [56]:
# Simulated latency over time.
df.iplot(x="timestamp", y="latency")
In [57]:
# We should see far fewer successful logins than HTTP 200s, since 200 OKs happen a lot
# (login_success is 0.6x the traffic value, http_200 is 5x).
df.iplot(x="timestamp", y=["login_success", "http_200"])
In [62]:
# Index by real datetimes so the frame can be resampled by time.
df.index = pd.to_datetime(df["timestamp"])

# Down-sample into 60-minute buckets (the data is already one row per minute),
# summing the error flags so each point is the number of errors in that hour.
# The string "timestamp" column is deliberately left out of the selection: it
# cannot be meaningfully summed, and reset_index() re-creates a "timestamp"
# column from the DatetimeIndex (a second one would collide with it).
# "60min" is the non-deprecated spelling of the old "60T" alias.
resampled = (
    df[["db_errors_host01", "db_errors_host02", "db_errors_host03"]]
    .resample("60min")
    .sum()
    .reset_index()
)
resampled.iplot(kind="scatter", mode="markers", x="timestamp")